Practical 2: Text Pre-processing


Text Mining, Transforming Text into Knowledge (202400006)

In this practical, we are first going to learn some pre-processing methods for text. These are methods that can help us with cleaning, normalizing, and structuring raw text data into a format suitable for analysis or input into NLP models.

Pre-processing simple texts

1. In programming, text is stored as a string variable, which can also be seen as an array of characters. Create a variable a with the text value "Hello @Text Mining World! I'm here to learn, right?", and then print it!

In [5]:
a = "Hello @Text Mining World! I'm here to learn, right?"
a
Out[5]:
"Hello @Text Mining World! I'm here to learn, right?"

2. Import the nltk package and use the string method lower() to convert the characters in string a to their lowercase form, saving the result in a new variable b.

In [7]:
import nltk
b = a.lower()
b
Out[7]:
"hello @text mining world! i'm here to learn, right?"

NB: nltk comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: http://www.nltk.org/nltk_data/

To install the data, after installing nltk, you can use the nltk.download() data downloader. We will make use of this at several points below (e.g., for the tokenizer models, stop word lists, and WordNet).
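For example, a minimal sketch downloading the resources used later in this practical (these are the standard nltk resource identifiers):

import nltk
nltk.download('punkt')      # tokenizer models, used by word_tokenize() and sent_tokenize()
nltk.download('stopwords')  # stop word lists, used in Question 14
nltk.download('wordnet')    # WordNet data, used by the lemmatizer in Question 15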

3. Use the string package to print the list of punctuation characters.

Punctuation can separate characters, words, phrases, or sentences. In some applications punctuation marks are very important to the task at hand; in others they are redundant and should be removed!

In [10]:
import string
print(string.punctuation)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

4. Use the punctuation list to remove the punctuation from the lowercase form of our example string a. Name your variable c.

In [12]:
# Remember there are many ways to remove punctuation! This is only one of them:
c = "".join([char for char in b if char not in string.punctuation])
print(c)
hello text mining world im here to learn right

5. Use the word_tokenize() function from nltk to tokenize string b. Compare the result with the tokenization of string c.

In [14]:
from nltk.tokenize import word_tokenize
print(word_tokenize(b))
print(word_tokenize(c))
# You might need to download Punkt Tokenizer Models
# In this case, run the code nltk.download('punkt')
['hello', '@', 'text', 'mining', 'world', '!', 'i', "'m", 'here', 'to', 'learn', ',', 'right', '?']
['hello', 'text', 'mining', 'world', 'im', 'here', 'to', 'learn', 'right']

We see that the main difference is in the punctuation; however, we also see that some words, such as "i'm" → "im", are now concatenated in the tokenization of string c.

6. Use RegexpTokenizer() from nltk to tokenize the string b whilst removing punctuation. This way you will avoid unnecessary concatenations.

In [17]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize(b)
Out[17]:
['hello', 'text', 'mining', 'world', 'i', 'm', 'here', 'to', 'learn', 'right']

With this tokenizer, you get output similar to the tokenization of string c.
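If you would rather keep contractions such as "i'm" together as single tokens, you can adapt the pattern. A minimal sketch, where the regular expression is just one possible choice:

tokenizer2 = RegexpTokenizer(r"\w+(?:'\w+)?")
print(tokenizer2.tokenize(b))
# e.g. ['hello', 'text', 'mining', 'world', "i'm", 'here', 'to', 'learn', 'right']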

7. Use the sent_tokenize() function from the nltk package to split the string b into sentences. Compare the result with the sentence tokenization of string c.

In [20]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(b))
print(sent_tokenize(c))
['hello @text mining world!', "i'm here to learn, right?"]
['hello text mining world im here to learn right']

An obvious question is why sentence tokenization is needed when we already have word tokenization. Imagine you need to count the average number of words per sentence. How would you calculate it? For such a task, you need both the NLTK sentence tokenizer and the NLTK word tokenizer to compute the ratio. This kind of output serves as an important feature for machine learning, as the answer is numeric.
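A minimal sketch of such a calculation, using string b from above (note that word_tokenize() also counts punctuation as tokens, so you may want to filter those out first):

sentences = sent_tokenize(b)
n_tokens = sum(len(word_tokenize(s)) for s in sentences)
print(n_tokens / len(sentences))  # average number of tokens per sentence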

Pre-processing a text corpus (dataset)

Pre-processing a dataset is similar to pre-processing simple text strings. First, we need to get some data. For this, we can use our own dataset, scrape data from the web, or use social media APIs. There are also some websites with publicly available datasets:

  • CLARIN Resource Families
  • UCI Machine Learning Repository
  • Kaggle

Here, we want to analyze and pre-process the Taylor Swift song lyrics data from all her albums. The dataset can be downloaded from the course website or alternatively from Kaggle.

Upload taylor_swift_lyrics.csv to Google Colab. You can do this by clicking on the Files button on the far left side of Colab and dragging and dropping the data there, or by clicking the upload button. Alternatively, you can mount Google Drive and upload the dataset there.

8. Read the taylor_swift_lyrics.csv dataset. Check the dataframe using the head() and tail() functions.

In [26]:
import pandas as pd
ts_lyrics = pd.read_csv("data/taylor_swift_lyrics.csv")
In [27]:
ts_lyrics.head()
Out[27]:
Artist Album Title Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe...
In [28]:
ts_lyrics.tail()
Out[28]:
Artist Album Title Lyrics
127 Taylor Swift folklore mad woman What did you think I'd say to that?\nDoes a sc...
128 Taylor Swift folklore epiphany Keep your helmet\nKeep your life, son\nJust a ...
129 Taylor Swift folklore betty Betty, I won't make assumptions about why you ...
130 Taylor Swift folklore peace Our coming of age has come and gone\nSuddenly ...
131 Taylor Swift folklore hoax My only one\nMy smoking gun\nMy eclipsed sun\n...
In [29]:
ts_lyrics.iloc[0]
Out[29]:
Artist                                         Taylor Swift 
Album                                          Taylor Swift 
Title                                             Tim McGraw
Lyrics     He said the way my blue eyes shinx\nPut those ...
Name: 0, dtype: object
In [30]:
ts_lyrics.head(1)
Out[30]:
Artist Album Title Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ...

9. Add a new column to the dataframe and name it Preprocessed Lyrics, then fill the column with the preprocessed text, applying the steps in this and the following questions. First, replace the \n sequences with a space character.

In [32]:
import re
def remove_linebreaks(text):
    """custom function to remove the line breaks"""
    return re.sub(r'\n', ' ', text)

ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Lyrics"].apply(lambda text: remove_linebreaks(text))
ts_lyrics.head()
Out[32]:
Artist Album Title Lyrics Preprocessed Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ... He said the way my blue eyes shinx Put those G...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan... State the obvious, I didn't get my perfect fan...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ... Drew looks at me, I fake a smile so he won't s...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca... I don't know what I want, so don't ask me 'Cau...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe... You have a way of coming easily to me And when...

10. Write another custom function to remove the punctuation. You can use the previous method or make use of the built-in str.maketrans() method together with translate().

In [34]:
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', string.punctuation))

ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: remove_punctuation(text))
ts_lyrics.head()
Out[34]:
Artist Album Title Lyrics Preprocessed Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ... He said the way my blue eyes shinx Put those G...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan... State the obvious I didnt get my perfect fanta...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ... Drew looks at me I fake a smile so he wont see...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca... I dont know what I want so dont ask me Cause I...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe... You have a way of coming easily to me And when...

11. Change all the characters to their lowercase forms. Think about why and when we need this step in our analysis.

In [36]:
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].str.lower()
ts_lyrics.head()
Out[36]:
Artist Album Title Lyrics Preprocessed Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ... he said the way my blue eyes shinx put those g...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan... state the obvious i didnt get my perfect fanta...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ... drew looks at me i fake a smile so he wont see...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca... i dont know what i want so dont ask me cause i...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe... you have a way of coming easily to me and when...

12. List the 20 most frequent terms in this dataframe.

In [38]:
from collections import Counter

# To get all lyrics in one text, you can concatenate all of them using the " ".join(list) syntax,
# which joins all elements in a list separating them by whitespace.
text = " ".join(lyric for lyric in ts_lyrics["Preprocessed Lyrics"])

# split() returns a list of all the words in the string
split_it = text.split()

# Pass the split_it list to an instance of the Counter class.
counter = Counter(split_it)

# most_common(k) returns the k most frequent values and their counts.
most_occur = counter.most_common(20)

print(most_occur)
[('i', 2377), ('you', 2319), ('the', 1623), ('and', 1403), ('me', 885), ('to', 843), ('a', 787), ('in', 686), ('it', 674), ('my', 642), ('of', 492), ('your', 475), ('we', 441), ('that', 436), ('all', 436), ('but', 428), ('like', 406), ('im', 404), ('this', 394), ('know', 380)]

You see that these are mainly stop words. Before removing them, let's plot a wordcloud of our data.

13. Plot a wordcloud with at most 50 words using the WordCloud() function from the wordcloud package. Use the command ?WordCloud to check the help for this function.

In [41]:
!pip install -q wordcloud
In [42]:
from wordcloud import WordCloud
?WordCloud
Init signature:
WordCloud(
    font_path=None,
    width=400,
    height=200,
    margin=2,
    ranks_only=None,
    prefer_horizontal=0.9,
    mask=None,
    scale=1,
    color_func=None,
    max_words=200,
    min_font_size=4,
    stopwords=None,
    random_state=None,
    background_color='black',
    max_font_size=None,
    font_step=1,
    mode='RGB',
    relative_scaling='auto',
    regexp=None,
    collocations=True,
    colormap=None,
    normalize_plurals=True,
    contour_width=0,
    contour_color='black',
    repeat=False,
    include_numbers=False,
    min_word_length=0,
    collocation_threshold=30,
)
Docstring:     
Word cloud object for generating and drawing.

Parameters
----------
font_path : string
    Font path to the font that will be used (OTF or TTF).
    Defaults to DroidSansMono path on a Linux machine. If you are on
    another OS or don't have this font, you need to adjust this path.

width : int (default=400)
    Width of the canvas.

height : int (default=200)
    Height of the canvas.

prefer_horizontal : float (default=0.90)
    The ratio of times to try horizontal fitting as opposed to vertical.
    If prefer_horizontal < 1, the algorithm will try rotating the word
    if it doesn't fit. (There is currently no built-in way to get only
    vertical words.)

mask : nd-array or None (default=None)
    If not None, gives a binary mask on where to draw words. If mask is not
    None, width and height will be ignored and the shape of mask will be
    used instead. All white (#FF or #FFFFFF) entries will be considerd
    "masked out" while other entries will be free to draw on. [This
    changed in the most recent version!]

contour_width: float (default=0)
    If mask is not None and contour_width > 0, draw the mask contour.

contour_color: color value (default="black")
    Mask contour color.

scale : float (default=1)
    Scaling between computation and drawing. For large word-cloud images,
    using scale instead of larger canvas size is significantly faster, but
    might lead to a coarser fit for the words.

min_font_size : int (default=4)
    Smallest font size to use. Will stop when there is no more room in this
    size.

font_step : int (default=1)
    Step size for the font. font_step > 1 might speed up computation but
    give a worse fit.

max_words : number (default=200)
    The maximum number of words.

stopwords : set of strings or None
    The words that will be eliminated. If None, the build-in STOPWORDS
    list will be used. Ignored if using generate_from_frequencies.

background_color : color value (default="black")
    Background color for the word cloud image.

max_font_size : int or None (default=None)
    Maximum font size for the largest word. If None, height of the image is
    used.

mode : string (default="RGB")
    Transparent background will be generated when mode is "RGBA" and
    background_color is None.

relative_scaling : float (default='auto')
    Importance of relative word frequencies for font-size.  With
    relative_scaling=0, only word-ranks are considered.  With
    relative_scaling=1, a word that is twice as frequent will have twice
    the size.  If you want to consider the word frequencies and not only
    their rank, relative_scaling around .5 often looks good.
    If 'auto' it will be set to 0.5 unless repeat is true, in which
    case it will be set to 0.

    .. versionchanged: 2.0
        Default is now 'auto'.

color_func : callable, default=None
    Callable with parameters word, font_size, position, orientation,
    font_path, random_state that returns a PIL color for each word.
    Overwrites "colormap".
    See colormap for specifying a matplotlib colormap instead.
    To create a word cloud with a single color, use
    ``color_func=lambda *args, **kwargs: "white"``.
    The single color can also be specified using RGB code. For example
    ``color_func=lambda *args, **kwargs: (255,0,0)`` sets color to red.

regexp : string or None (optional)
    Regular expression to split the input text into tokens in process_text.
    If None is specified, ``r"\w[\w']+"`` is used. Ignored if using
    generate_from_frequencies.

collocations : bool, default=True
    Whether to include collocations (bigrams) of two words. Ignored if using
    generate_from_frequencies.


    .. versionadded: 2.0

colormap : string or matplotlib colormap, default="viridis"
    Matplotlib colormap to randomly draw colors from for each word.
    Ignored if "color_func" is specified.

    .. versionadded: 2.0

normalize_plurals : bool, default=True
    Whether to remove trailing 's' from words. If True and a word
    appears with and without a trailing 's', the one with trailing 's'
    is removed and its counts are added to the version without
    trailing 's' -- unless the word ends with 'ss'. Ignored if using
    generate_from_frequencies.

repeat : bool, default=False
    Whether to repeat words and phrases until max_words or min_font_size
    is reached.

include_numbers : bool, default=False
    Whether to include numbers as phrases or not.

min_word_length : int, default=0
    Minimum number of letters a word must have to be included.

collocation_threshold: int, default=30
    Bigrams must have a Dunning likelihood collocation score greater than this
    parameter to be counted as bigrams. Default of 30 is arbitrary.

    See Manning, C.D., Manning, C.D. and Schütze, H., 1999. Foundations of
    Statistical Natural Language Processing. MIT press, p. 162
    https://nlp.stanford.edu/fsnlp/promo/colloc.pdf#page=22

Attributes
----------
``words_`` : dict of string to float
    Word tokens with associated frequency.

    .. versionchanged: 2.0
        ``words_`` is now a dictionary

``layout_`` : list of tuples ((string, float), int, (int, int), int, color))
    Encodes the fitted word cloud. For each word, it encodes the string,
    normalized frequency, font size, position, orientation, and color.
    The frequencies are normalized by the most commonly occurring word.
    The color is in the format of 'rgb(R, G, B).'

Notes
-----
Larger canvases make the code significantly slower. If you need a
large word cloud, try a lower canvas size, and set the scale parameter.

The algorithm might give more weight to the ranking of the words
than their actual frequencies, depending on the ``max_font_size`` and the
scaling heuristic.
File:           /opt/anaconda3/lib/python3.12/site-packages/wordcloud/wordcloud.py
Type:           type
Subclasses:     
In [43]:
import matplotlib.pyplot as plt

wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
(Output: a word cloud of the 50 most frequent terms.)
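If you want to keep the figure, the WordCloud object can also write it to disk with to_file(); the filename below is just an example:

wordcloud.to_file("wordcloud_50_words.png")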

14. Use the English stop word list from the nltk package to remove the stop words. Check the stop words and extend the set with your own optional list of words, for example: "im", "youre", "id", "dont", "cant", "didnt", "ive", "ill", "hasnt". Show the 20 most frequent terms and plot the wordcloud of 50 words again.

In [45]:
#nltk.download('stopwords')
from nltk.corpus import stopwords
# run the code nltk.download('stopwords') if needed
stop_words = set(stopwords.words('english'))
print(stop_words)
{'you', 'o', 'should', 'its', 'has', 'of', 'from', 'then', 'what', 'yours', "aren't", "you'll", 'any', "don't", 'own', 'who', 's', 'have', 'having', 'than', 'wouldn', 'by', 'her', 'itself', 'didn', 'few', 'while', 'he', "you've", 'ours', 'which', 'weren', 'if', 'between', 'mightn', 'under', 'and', 'against', 'our', 'ourselves', "shouldn't", 'we', 'how', 'isn', 'both', 'myself', 'a', 'more', 'now', 'during', 'y', 'hadn', 'further', 'in', 'hasn', 'did', 'was', 'are', 'off', 'be', 'before', 'nor', 't', "should've", 'hers', 'does', 'it', 'aren', 'again', 'where', 'she', 'm', 'can', 'haven', "wouldn't", "didn't", 'here', 'same', "wasn't", 'very', "haven't", "shan't", 'themselves', 'him', 'at', 'with', 'himself', 'up', 'herself', 'them', 'yourselves', 'doing', 'out', 'an', 'will', 'ma', 'but', "mightn't", 'each', 'were', "doesn't", 'about', 'over', 'won', 'because', 'their', "you're", 'above', "she's", 'some', "weren't", 'so', "it's", 'his', 'on', 'when', 'had', 'doesn', 'for', 'the', 'just', 'don', 'd', "couldn't", "you'd", "isn't", 'that', 'until', 'such', 'needn', 'your', 'yourself', 'into', 'below', 'do', 'being', 'most', 'been', 'as', 'those', "needn't", "hasn't", 'they', 'through', 'wasn', "hadn't", 'mustn', 'this', 'only', 'shouldn', 'whom', 'couldn', 'am', 'me', 'my', 'all', 'after', 'not', 'why', 'once', 'there', 'down', 'll', "won't", 'theirs', 'no', "that'll", 'i', 'ain', 've', 'or', 'to', 'these', 're', 'shan', 'too', 'other', "mustn't", 'is'}
In [46]:
stop_words.update(["im", "youre", "id", "dont", "cant", "didnt", "ive", "ill", "hasnt"])
# stop_words.discard('word') # use this when you want to remove a word from the set
print(stop_words)
{'you', 'o', 'should', 'its', 'has', 'of', 'from', 'then', 'what', 'yours', "aren't", "you'll", 'any', "don't", 'own', 'who', 's', 'have', 'having', 'than', 'wouldn', 'by', 'her', 'itself', 'didn', 'few', 'while', 'he', "you've", 'ours', 'which', 'weren', 'if', 'between', 'mightn', 'youre', 'under', 'hasnt', 'and', 'against', 'dont', 'our', 'didnt', 'ourselves', "shouldn't", 'we', 'how', 'isn', 'both', 'myself', 'a', 'more', 'now', 'during', 'y', 'hadn', 'further', 'in', 'hasn', 'did', 'was', 'are', 'off', 'be', 'before', 'nor', 't', 'ill', "should've", 'hers', 'does', 'it', 'cant', 'aren', 'again', 'where', 'she', 'm', 'can', 'haven', "wouldn't", "didn't", 'here', 'same', 'im', "wasn't", 'very', "haven't", "shan't", 'themselves', 'him', 'at', 'with', 'himself', 'up', 'herself', 'them', 'yourselves', 'doing', 'out', 'an', 'will', 'ma', 'but', "mightn't", 'each', 'were', "doesn't", 'about', 'over', 'won', 'because', 'their', "you're", 'above', "she's", 'some', "weren't", 'so', "it's", 'his', 'on', 'when', 'had', 'doesn', 'for', 'the', 'just', 'don', 'd', "couldn't", "you'd", 'id', "isn't", 'that', 'until', 'such', 'needn', 'your', 'yourself', 'into', 'below', 'do', 'being', 'most', 'been', 'as', 'those', "needn't", "hasn't", 'they', 'through', 'wasn', "hadn't", 'mustn', 'this', 'only', 'shouldn', 'whom', 'couldn', 'am', 'me', 'my', 'ive', 'all', 'after', 'not', 'why', 'once', 'there', 'down', 'll', "won't", 'theirs', 'no', "that'll", 'i', 'ain', 've', 'or', 'to', 'these', 're', 'shan', 'too', 'other', "mustn't", 'is'}
In [47]:
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in stop_words])

ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: remove_stopwords(text))
ts_lyrics.head()
Out[47]:
Artist Album Title Lyrics Preprocessed Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ... said way blue eyes shinx put georgia stars sha...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan... state obvious get perfect fantasy realize love...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ... drew looks fake smile wont see want need every...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca... know want ask cause still trying figure know w...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe... way coming easily take take best start fight c...
In [48]:
from collections import Counter

# To get all lyrics in one text, you can concatenate all of them using the " ".join(list) syntax,
# which joins all elements in a list separating them by whitespace.
text = " ".join(lyric for lyric in ts_lyrics["Preprocessed Lyrics"])

# split() returns a list of all the words in the string
split_it = text.split()

# Pass the split_it list to an instance of the Counter class.
counter = Counter(split_it)

# most_common(k) returns the k most frequent values and their counts.
most_occur = counter.most_common(20)

print(most_occur)
[('like', 406), ('know', 380), ('oh', 322), ('never', 294), ('love', 246), ('back', 240), ('time', 224), ('cause', 213), ('one', 177), ('say', 176), ('see', 170), ('got', 159), ('wanna', 158), ('think', 153), ('baby', 153), ('come', 150), ('go', 149), ('want', 142), ('ever', 134), ('could', 133)]
In [49]:
wordcloud = WordCloud(max_words=50, background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
(Output: a word cloud of the 50 most frequent terms, after stop word removal.)

15. We can apply stemming or lemmatization to our text data. Apply a lemmatizer from nltk and save the results.

In [51]:
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# run the code nltk.download('wordnet') if needed
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: lemmatize_words(text))
ts_lyrics.head()
Out[51]:
Artist Album Title Lyrics Preprocessed Lyrics
0 Taylor Swift Taylor Swift Tim McGraw He said the way my blue eyes shinx\nPut those ... said way blue eye shinx put georgia star shame...
1 Taylor Swift Taylor Swift Picture to Burn State the obvious, I didn't get my perfect fan... state obvious get perfect fantasy realize love...
2 Taylor Swift Taylor Swift Teardrops on my Guitar Drew looks at me,\nI fake a smile so he won't ... drew look fake smile wont see want need everyt...
3 Taylor Swift Taylor Swift A Place in This World I don't know what I want, so don't ask me\n'Ca... know want ask cause still trying figure know w...
4 Taylor Swift Taylor Swift Cold As You You have a way of coming easily to me\nAnd whe... way coming easily take take best start fight c...
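Note that WordNetLemmatizer() treats every word as a noun unless you pass a part-of-speech tag. A minimal sketch of the difference:

print(lemmatizer.lemmatize("running"))           # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", pos="v"))  # 'run' (treated as a verb)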

And here is the code for stemming:

In [53]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

# since we applied the lemmatization, we don't apply stemming; though you can try it!
# ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: stem_words(text))
# ts_lyrics.head()

The PorterStemmer() is for the English language. If we are working with other languages, we can use other stemmers, such as the SnowballStemmer(), which supports the following languages:

In [55]:
from nltk.stem.snowball import SnowballStemmer
SnowballStemmer.languages
Out[55]:
('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')
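A minimal sketch of using one of these languages; the example word is arbitrary:

stemmer_en = SnowballStemmer("english")
print(stemmer_en.stem("running"))  # 'run'
SnowballStemmer("dutch")           # works the same way for Dutch text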

Text representation with the Vector Space Model

16. Use CountVectorizer() from the sklearn package and build a bag of words model on Preprocessed Lyrics based on term frequency. Check the shape of the output matrix.

In [58]:
from sklearn.feature_extraction.text import CountVectorizer # for bag of words feature extraction

# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
vectorizer1 = CountVectorizer(max_features = 3000)

# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms our data into feature vectors.
# The input to fit_transform should be a list (or other iterable) of strings.
dtm = vectorizer1.fit_transform(ts_lyrics["Preprocessed Lyrics"])
print(dtm.shape)
(132, 2597)
In [59]:
dtm
Out[59]:
<132x2597 sparse matrix of type '<class 'numpy.int64'>'
	with 10530 stored elements in Compressed Sparse Row format>
In [60]:
# we can convert it to a dataframe
dtm_df = dtm.toarray()
dtm_df = pd.DataFrame(dtm_df)
dtm_df.head()
Out[60]:
0 1 2 3 4 5 6 7 8 9 ... 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 1 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 2597 columns
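The integer column names above are not very informative. A minimal sketch that labels the columns with the learned vocabulary instead (reusing the fitted vectorizer1):

dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer1.get_feature_names_out())
dtm_df.head()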

17. Inspect the first 100 terms in the vocabulary.

In [62]:
# Take a look at the words in the vocabulary
vocab = vectorizer1.get_feature_names_out()
print(vocab[1:100])  # NB: this slice skips vocab[0]; use vocab[:100] for the first 100 terms
['16' '16th' '45' '4am' 'aah' 'abigail' 'absent' 'absurd' 'accent'
 'accident' 'accused' 'ace' 'achilles' 'aching' 'acing' 'across' 'act'
 'acted' 'actress' 'actually' 'add' 'adjusting' 'admit' 'adore'
 'adventure' 'affair' 'afraid' 'afterglow' 'afternoon' 'age' 'ago' 'ah'
 'ahah' 'ahahah' 'ahead' 'ahh' 'aim' 'aint' 'air' 'airplane' 'aisle'
 'album' 'aligned' 'alive' 'alls' 'almost' 'alone' 'along' 'alpha'
 'already' 'alright' 'altar' 'always' 'ambition' 'amen' 'american'
 'americana' 'amnesia' 'amount' 'andi' 'ane' 'angel' 'angry' 'another'
 'answer' 'anthem' 'anther' 'anticipation' 'anybody' 'anymore' 'anyone'
 'anything' 'anyway' 'anywhere' 'apart' 'apartment' 'apology' 'applause'
 'archer' 'architect' 'arent' 'argue' 'arm' 'armor' 'around' 'arrowhead'
 'ash' 'aside' 'ask' 'asked' 'asking' 'asleep' 'assume' 'assumption' 'ate'
 'ateam' 'attached' 'attack' 'attitude']

18. Using TfidfVectorizer(), you can create a model based on tfidf. Apply this vectorizer to your text data. Does the shape of the output matrix differ from dtm?

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object
vectorizer2 = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer2.fit_transform(ts_lyrics["Preprocessed Lyrics"])

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)
(132, 2597)
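No, the shape is identical: both matrices are 132 × 2597. The max_features=3000 cap in vectorizer1 was never reached, since the corpus contains only 2597 unique terms, and TfidfVectorizer() learns the same vocabulary with its default settings. The two models differ only in the cell values: tf-idf weights instead of raw counts.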

19. Use the TfidfVectorizer() to create an n-gram based model with n = 1 and 2. Use the ngram_range argument to determine the lower and upper boundary of the range of n-values for different n-grams to be extracted. (tip: use ?TfidfVectorizer)

In [66]:
# Create TfidfVectorizer object
vectorizer3 = TfidfVectorizer(ngram_range=(1, 2))

# Generate matrix of word vectors
tfidf_matrix3 = vectorizer3.fit_transform(ts_lyrics["Preprocessed Lyrics"])

# Print the shape of tfidf_matrix
print(tfidf_matrix3.shape)
(132, 15016)
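A quick way to see where the extra columns come from is to inspect the fitted vocabulary, which now mixes unigrams and bigrams. A minimal sketch reusing vectorizer3 from above:

vocab3 = vectorizer3.get_feature_names_out()
print(len(vocab3))                           # 15016 features in total
print([t for t in vocab3 if " " in t][:10])  # a few of the bigram features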

20. We want to compare the lyrics of the Friends theme song with the lyrics of Taylor Swift's songs and find the most similar one. Use the string below. First, apply the pre-processing steps, and then transform the text into count and tfidf vectors. Do the bag-of-words models agree on the most similar song to the Friends theme song?

In [68]:
friends_theme_lyrics = "So no one told you life was going to be this way. Your job's a joke, you're broke, you're love life's DOA. It's like you're always stuck in second gear, When it hasn\'t been your day, your week, your month, or even your year. But, I\'ll be there for you, when the rain starts to pour. I\'ll be there for you, like I\'ve been there before. I\'ll be there for you, cause you\'re there for me too."
friends_theme_lyrics
Out[68]:
"So no one told you life was going to be this way. Your job's a joke, you're broke, you're love life's DOA. It's like you're always stuck in second gear, When it hasn't been your day, your week, your month, or even your year. But, I'll be there for you, when the rain starts to pour. I'll be there for you, like I've been there before. I'll be there for you, cause you're there for me too."
In [69]:
friends_theme_lyrics = remove_punctuation(friends_theme_lyrics)
friends_theme_lyrics = friends_theme_lyrics.lower()
friends_theme_lyrics = remove_stopwords(friends_theme_lyrics)
friends_theme_lyrics = lemmatize_words(friends_theme_lyrics)
friends_theme_lyrics
Out[69]:
'one told life going way job joke broke love life doa like always stuck second gear day week month even year rain start pour like cause'
In [70]:
friends_theme_lyrics_tf = vectorizer1.transform([friends_theme_lyrics])
friends_theme_lyrics_tf.shape  # (1, 2597); only the last expression in a cell is displayed
dtm.shape
Out[70]:
(132, 2597)
In [71]:
from sklearn.metrics.pairwise import cosine_similarity

# compute and print the cosine similarity matrix
cosine_sim_dtm = cosine_similarity(dtm, friends_theme_lyrics_tf)

print(cosine_sim_dtm)
[[0.07295675]
 [0.05749499]
 [0.05668202]
 [0.099573  ]
 [0.09816136]
 [0.00975761]
 [0.16365771]
 [0.11501093]
 [0.02125256]
 [0.15038123]
 [0.07792865]
 [0.09200874]
 [0.17766726]
 [0.0360492 ]
 [0.0786839 ]
 [0.1062023 ]
 [0.23829304]
 [0.08566568]
 [0.15519271]
 [0.1352231 ]
 [0.03202563]
 [0.19158319]
 [0.09043166]
 [0.19051587]
 [0.10639904]
 [0.12562973]
 [0.13526614]
 [0.1340465 ]
 [0.10332549]
 [0.14529915]
 [0.08091962]
 [0.0428993 ]
 [0.05358677]
 [0.11510231]
 [0.03928371]
 [0.05463417]
 [0.0946985 ]
 [0.0745356 ]
 [0.24685715]
 [0.07198268]
 [0.09507654]
 [0.11511347]
 [0.11136921]
 [0.10401235]
 [0.1946593 ]
 [0.15567091]
 [0.21439196]
 [0.13088543]
 [0.11021668]
 [0.09369712]
 [0.11888042]
 [0.06581261]
 [0.00903711]
 [0.21465394]
 [0.22794562]
 [0.04007421]
 [0.06975801]
 [0.05602768]
 [0.01563873]
 [0.10146346]
 [0.13488377]
 [0.1500909 ]
 [0.0521599 ]
 [0.16455472]
 [0.20490974]
 [0.17563692]
 [0.13237606]
 [0.02857238]
 [0.03055662]
 [0.17989569]
 [0.0790393 ]
 [0.00461099]
 [0.09170196]
 [0.02086808]
 [0.03288424]
 [0.11242975]
 [0.044955  ]
 [0.02726372]
 [0.18975469]
 [0.06574775]
 [0.08736843]
 [0.07787518]
 [0.24627294]
 [0.04908068]
 [0.13145637]
 [0.05978084]
 [0.14187609]
 [0.15555556]
 [0.04961695]
 [0.05384297]
 [0.09147674]
 [0.03362627]
 [0.09035781]
 [0.05615828]
 [0.07207214]
 [0.0340633 ]
 [0.16397832]
 [0.05627802]
 [0.09116057]
 [0.06624405]
 [0.07803834]
 [0.06365683]
 [0.04996305]
 [0.09610043]
 [0.10304734]
 [0.19756782]
 [0.01443376]
 [0.1750503 ]
 [0.18845876]
 [0.05396298]
 [0.17025131]
 [0.11426298]
 [0.10506787]
 [0.22829105]
 [0.08403295]
 [0.06714701]
 [0.05614346]
 [0.13715477]
 [0.01830783]
 [0.13255879]
 [0.07392213]
 [0.08475223]
 [0.20016019]
 [0.09798273]
 [0.10425721]
 [0.07647191]
 [0.04792568]
 [0.23490916]
 [0.0564445 ]
 [0.11111111]
 [0.08512565]
 [0.05363453]]
In [72]:
import numpy as np
max_index = np.argmax(cosine_sim_dtm, axis=0)
print(cosine_sim_dtm[max_index])
max_index
[[0.24685715]]
Out[72]:
array([38])
In [73]:
ts_lyrics.iloc[max_index]
Out[73]:
Artist Album Title Lyrics Preprocessed Lyrics
38 Taylor Swift Speak Now The Story of Us I used to think one day we'd tell the story of... used think one day wed tell story u met spark ...
In [74]:
ts_lyrics["Preprocessed Lyrics"].iloc[38]
Out[74]:
'used think one day wed tell story u met spark flew instantly people would say theyre lucky one used know place spot next searching room empty seat cause lately even know page oh simple complication miscommunications lead fall many thing wish knew many wall break standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy next chapter howd end way see nervously pulling clothes trying look busy best avoid starting think one day tell story u losing mind saw held pride like held oh scared see ending pretending nothing tell miss know never heard silence quite loud standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy looking like contest act like care le liked better side battle hand would lay armor youd say youd rather love fight many thing wish knew story u might ending soon standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy end'
In [75]:
friends_theme_lyrics_tfidf = vectorizer3.transform([friends_theme_lyrics])
print(friends_theme_lyrics_tfidf.shape)
print(tfidf_matrix3.shape)
# compute and print the cosine similarity matrix
cosine_sim_tfidf = cosine_similarity(tfidf_matrix3, friends_theme_lyrics_tfidf)

print(cosine_sim_tfidf)
(1, 15016)
(132, 15016)
[[0.02369657]
 [0.01318075]
 [0.01128244]
 [0.03986478]
 [0.03108815]
 [0.00181568]
 [0.02572803]
 [0.02691028]
 [0.00512609]
 [0.04532813]
 [0.01615807]
 [0.00787449]
 [0.03826574]
 [0.0068687 ]
 [0.01320367]
 [0.01245819]
 [0.09768082]
 [0.03284433]
 [0.01891928]
 [0.05104409]
 [0.00801751]
 [0.045455  ]
 [0.02005361]
 [0.04590047]
 [0.0319897 ]
 [0.01850863]
 [0.02299573]
 [0.0238499 ]
 [0.01617267]
 [0.03525199]
 [0.02914826]
 [0.01417113]
 [0.01017535]
 [0.02329621]
 [0.01165122]
 [0.00883778]
 [0.0213434 ]
 [0.01468301]
 [0.052752  ]
 [0.01398688]
 [0.02316361]
 [0.02104993]
 [0.03310764]
 [0.01013489]
 [0.04137598]
 [0.04265813]
 [0.04249053]
 [0.03391019]
 [0.02922056]
 [0.01882679]
 [0.01167784]
 [0.01196412]
 [0.00448242]
 [0.03975051]
 [0.02942139]
 [0.02500672]
 [0.0149794 ]
 [0.01541409]
 [0.00149821]
 [0.01839457]
 [0.03851785]
 [0.02703587]
 [0.01044214]
 [0.01812665]
 [0.04455981]
 [0.03015706]
 [0.03756264]
 [0.00366365]
 [0.00325763]
 [0.0333111 ]
 [0.01114174]
 [0.00036564]
 [0.03036674]
 [0.00187995]
 [0.00755594]
 [0.01588253]
 [0.01206606]
 [0.02056343]
 [0.07499416]
 [0.01513767]
 [0.03379185]
 [0.01961567]
 [0.04624439]
 [0.00618851]
 [0.02567295]
 [0.01120954]
 [0.04119311]
 [0.01641921]
 [0.00560346]
 [0.00851409]
 [0.02050939]
 [0.0099525 ]
 [0.01615719]
 [0.00906767]
 [0.00936747]
 [0.01777334]
 [0.02409146]
 [0.02385133]
 [0.03111177]
 [0.01181365]
 [0.04510753]
 [0.0069002 ]
 [0.0124372 ]
 [0.00991338]
 [0.01700991]
 [0.0369523 ]
 [0.00665362]
 [0.02971692]
 [0.02632021]
 [0.01001789]
 [0.02780341]
 [0.01487108]
 [0.01907687]
 [0.03361355]
 [0.00807426]
 [0.00985324]
 [0.01145457]
 [0.01459391]
 [0.00972955]
 [0.03375572]
 [0.01890663]
 [0.01118828]
 [0.02691614]
 [0.01158261]
 [0.03914842]
 [0.01562798]
 [0.01257735]
 [0.02771936]
 [0.0127158 ]
 [0.03262289]
 [0.02177201]
 [0.00782077]]
In [76]:
max_index = np.argmax(cosine_sim_tfidf, axis=0)
print(cosine_sim_tfidf[max_index])
max_index
[[0.09768082]]
Out[76]:
array([16])
In [77]:
ts_lyrics.iloc[max_index]
Out[77]:
Artist Album Title Lyrics Preprocessed Lyrics
16 Taylor Swift Fearless Forever & Always Once upon a time, I believe it was a Tuesday w... upon time believe tuesday caught eye caught on...
In [78]:
ts_lyrics["Preprocessed Lyrics"].iloc[16]
Out[78]:
'upon time believe tuesday caught eye caught onto something hold onto night looked eye told loved kidding cause seems thing breaking almost never speak feel welcome anymore baby happened please tell cause one second perfect halfway door stare phone still called feel low feel nothing flashback said forever always oh rain bedroom everything wrong rain rain gone cause said forever always line say something way honest made run hide like scared little boy looked eye thought knew minute sure here everything coming nothing here silence cut core going thought knew minute anymore stare phone still called feel low feel nothing flashback said forever always oh rain bedroom everything wrong rain rain gone cause said forever always mean baby think oh back baby back forget everything back baby back forget everything cause rain bedroom everything wrong rain rain gone cause said forever always oh stare phone still called feel low feel nothing flashback said forever always rain bedroom everything wrong rain rain gone cause said forever always mean baby said forever always yeah'
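So the two bag-of-words models do not agree: the count model picks "The Story of Us", while the tf-idf model picks "Forever & Always". As a follow-up, a minimal sketch that ranks the five most similar songs under the tf-idf model instead of taking only the single best match:

top5 = np.argsort(cosine_sim_tfidf.ravel())[::-1][:5]
ts_lyrics[["Title", "Album"]].iloc[top5]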